# libraries in use
library(tidyverse)
library(dplyr)
library(usmap)
library(ggplot2)
library(maps)
library(mapdata)
library(plotly)
# Benjamin's read and clean function
read_and_clean_county_data <- function(file_path_name, state, outcome) {
read_csv(file_path_name,
skip = 8) %>%
janitor::clean_names() %>%
select(county, age_adjusted_incidence_rate_rate_note_cases_per_100_000) %>%
mutate(county = stringr::str_remove(county, "\\(.\\)$"),
state = state,
outcome = outcome,
age_adjusted_incidence_rate =
as.numeric(age_adjusted_incidence_rate_rate_note_cases_per_100_000)) %>%
select(state, county, outcome, age_adjusted_incidence_rate)
}
Next, I loaded in the county data sets using this function:
#county-level lung cancer PA data https://statecancerprofiles.cancer.gov/
pa_county_lc = read_and_clean_county_data('./data/lung/pa_lung_data_county.csv',
state = "PA",
outcome = "lung cancer") %>%
filter(complete.cases(.)) %>%
slice(-c(1, 2))
## New names:
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...5`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...6`
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...13`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...14`
## Rows: 87 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): County, FIPS, Met Healthy People Objective of ***?, CI*Rank([rank n...
## dbl (7): Age-Adjusted Incidence Rate([rate note]) - cases per 100,000, Lower...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: One or more parsing issues, see `problems()` for details
#county-level lung cancer OH data https://statecancerprofiles.cancer.gov/
oh_county_lc = read_and_clean_county_data('./data/lung/oh_lung_data_county.csv',
state = "OH",
outcome = "lung cancer") %>%
filter(complete.cases(.)) %>%
slice(-c(1, 2))
## New names:
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...5`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...6`
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...13`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...14`
## Rows: 108 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): County, FIPS, Met Healthy People Objective of ***?, CI*Rank([rank n...
## dbl (7): Age-Adjusted Incidence Rate([rate note]) - cases per 100,000, Lower...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: One or more parsing issues, see `problems()` for details
#county-level lung cancer NY data https://statecancerprofiles.cancer.gov/
ny_county_lc = read_and_clean_county_data('./data/lung/ny_lung_data_county.csv',
state = "NY",
outcome = "lung cancer") %>%
filter(complete.cases(.)) %>%
slice(-c(1, 2))
## New names:
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...5`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...6`
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...13`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...14`
## Rows: 82 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): County, FIPS, Met Healthy People Objective of ***?, CI*Rank([rank n...
## dbl (7): Age-Adjusted Incidence Rate([rate note]) - cases per 100,000, Lower...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: One or more parsing issues, see `problems()` for details
Finally, I loaded in the state-level lung cancer data
#state-level lung cancer PA data https://www.phaim1.health.pa.gov/EDD/
pa_cancer = read.csv('./data/lung/pa_state_lung.csv', sep = ";", header = T, skip = 3) %>%
janitor::clean_names() %>%
select(year, rate_ratio_result) %>%
map_df(str_replace, pattern = ",", replacement = ".") %>%
map_df(as.numeric) %>%
rename(c("age_adjusted_incidence_rate" = "rate_ratio_result")) %>%
add_column(state = "PA")
#state-level lung cancer OH data https://publicapps.odh.ohio.gov/EDW/DataBrowser/Browse/StateLayoutLockdownCancers
oh_cancer = read.csv('./data/lung/oh_state_lung.csv', sep = ";") %>%
janitor::clean_names() %>%
rename(year = cancer_year_year) %>%
select(year, age_adjusted_rate) %>%
map_df(str_replace, pattern = ",", replacement = ".") %>%
map_df(as.numeric) %>%
rename(c("age_adjusted_incidence_rate" = "age_adjusted_rate")) %>%
add_column(state = "OH") %>%
distinct() %>%
filter(complete.cases(.))
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
#state-level lung cancer NY data https://www.health.ny.gov/statistics/cancer/registry/table2/tb2lungnys.htm
ny_cancer = read.csv("./data/lung/ny_state_lung.csv", skip = 2)[ ,1:3] %>%
janitor::clean_names() %>%
rename(year = x) %>%
select(year, rate_per_100_000_population) %>%
rename(c("age_adjusted_incidence_rate" = "rate_per_100_000_population")) %>%
add_column(state = "NY")
# merging county level data for lung cancer
fips_codes = read.csv('./data/fips_codes.csv') %>%
janitor::clean_names() %>%
filter(state %in% c("NY", "PA", "OH")) %>%
rename(c("county" = "name"))
fips_codes$county[fips_codes$county == "St Lawrence"] <- "St. Lawrence"
county_lc <- rbind(pa_county_lc, oh_county_lc, ny_county_lc) %>%
mutate(county = (gsub('.{7}$', '', county)))
lc_fips <- merge(county_lc, fips_codes, by = c("state", "county"))
# lung cancer incidence maps by county
ny_map <- plot_usmap(data = lc_fips, values = "age_adjusted_incidence_rate", "counties", include = c("NY"), color = "black") +
labs(title = "New York Age-Adjusted Lung Cancer Rates, 2014-2018") +
scale_fill_continuous(low = "#FFE9C7", high = "#FF0000",
name = "age_adjusted_incidence_rate", label = scales::comma) +
theme(plot.background = element_rect(), legend.position = "right")
oh_map <- plot_usmap(data = lc_fips, values = "age_adjusted_incidence_rate", "counties", include = c("OH"), color = "black") +
labs(title = "Ohio Age-Adjusted Lung Cancer Rates, 2014-2018") +
scale_fill_continuous(low = "#FFE9C7", high = "#FF0000",
name = "age_adjusted_incidence_rate", label = scales::comma) +
theme(plot.background = element_rect(), legend.position = "right")
pa_map <- plot_usmap(data = lc_fips, values = "age_adjusted_incidence_rate", "counties", include = c("PA"), color = "black") +
labs(title = "Pennsylvania Age-Adjusted Lung Cancer Rates, 2014-2018") +
scale_fill_continuous(low = "#FFE9C7", high = "#FF0000",
name = "age_adjusted_incidence_rate", label = scales::comma) +
theme(plot.background = element_rect(), legend.position = "right")
ny_oh_pa_lungcancer <- plot_usmap(data = lc_fips, values = "age_adjusted_incidence_rate", "counties", include = c("NY", "OH", "PA"), color = "black") +
labs(title = "NY, OH, and PA Age-Adjusted Lung Cancer Rates, 2014-2018") +
scale_fill_continuous(low = "#FFE9C7", high = "#FF0000",
name = "age_adjusted_incidence_rate", label = scales::comma) +
theme(plot.background = element_rect(), legend.position = "right")
# merging state level data for lung cancer
state_lc <- rbind(pa_cancer, oh_cancer, ny_cancer)
state_lc_wide = state_lc %>%
pivot_wider(
names_from = state,
values_from = age_adjusted_incidence_rate) %>%
rename(c("PA_AAIR" = "PA", "NY_AAIR" = "NY", "OH_AAIR" = "OH"))
write_csv(state_lc_wide, "./data/state_lc_wide.csv")
# plotly of county level data
fig <- plot_ly(state_lc_wide, x = ~year)
fig <- fig %>% add_lines(y = ~PA_AAIR, name = "Pennsylvania")
fig <- fig %>% add_lines(y = ~NY_AAIR, name = "New York")
fig <- fig %>% add_lines(y = ~OH_AAIR, name = "Ohio")
fig <- fig %>% layout(
title = "Lung Cancer Age-Adjusted Incidence Rates",
xaxis = list(
rangeselector = list(
buttons = list(
list(
count = 1,
label = "1 yr",
step = "year",
stepmode = "backward"),
list(
count = 5,
label = "5 yr",
step = "year",
stepmode = "backward"),
list(
count = 10,
label = "10 yr",
step = "year",
stepmode = "backward"),
list(step = "all"))),
rangeslider = list(type = "year")),
yaxis = list(title = "Age-Adjusted Incidence Rate (per 100,000"))
fig
apuv = read.csv('./ap/ap_uv/apuv.csv')
average_med_aqi =
apuv %>%
group_by(state, year) %>%
summarise_at(vars(pm25_med_pred), list(avg_med_aqi = mean))
lc_aqi_data <- merge(average_med_aqi, state_lc, by = c("year", "state"))
average_med_aqi_wide =
average_med_aqi %>%
pivot_wider(
names_from = state,
values_from = avg_med_aqi) %>%
rename(c("oh_avg_med_aqi" = "OH", "ny_avg_med_aqi" = "NY", "pa_avg_med_aqi" = "PA")) %>%
subset(select = -c(ME))
write_csv(lc_aqi_data, "./data/lc_aqi_data.csv")
fig1 <- lc_aqi_data %>%
plot_ly(
x = ~age_adjusted_incidence,
y = ~y,
frame = ~f,
type = 'scatter',
mode = 'markers',
showlegend = F
)
fig1 <- lc_aqi_data %>%
plot_ly(
x = ~avg_med_aqi,
y = ~age_adjusted_incidence_rate,
color = ~state,
frame = ~year,
text = ~state,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
)
fig1 <- fig1 %>% layout(
title = "Lung Cancer AAIR and Average Median AQI",
xaxis = list(title = "Average Median AQI",
type = "log"
),
yaxis = list(title = "Age Adjusted Incidence Rate")
)
fig1
ny_fips = lc_fips %>%
filter(state == "NY")
pa_fips = lc_fips %>%
filter(state == "PA")
oh_fips = lc_fips %>%
filter(state == "OH")
library(rjson)
library(plotly)
url <- 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
counties <- rjson::fromJSON(file = url)
county_map_lc <- plot_ly()
county_map_lc <- county_map_lc %>%
add_trace(
type = "choroplethmapbox",
geojson = counties,
locations = lc_fips$fips,
z = lc_fips$age_adjusted_incidence_rate,
text = lc_fips$county,
colorscale = "Viridis",
marker = list(line = list(
width = 0),
opacity = 0.5)
)
county_map_lc <- county_map_lc %>%
layout(
title = "Age-Adjusted Lung Cancer Incidence Rates (2014-2018)",
mapbox = list(
style = "carto-positron",
zoom = 4,
center = list(lon = -77.215135, lat = 41.164818)
)
)
county_map_lc
lm_lc_aqi = lm(age_adjusted_incidence_rate ~ avg_med_aqi, data = lc_aqi_data)
summary(lm_lc_aqi)
##
## Call:
## lm(formula = age_adjusted_incidence_rate ~ avg_med_aqi, data = lc_aqi_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3391 -2.0145 0.2906 1.6527 5.6890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.6447 1.8196 24.54 <2e-16 ***
## avg_med_aqi 2.3193 0.1776 13.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.465 on 46 degrees of freedom
## Multiple R-squared: 0.7875, Adjusted R-squared: 0.7829
## F-statistic: 170.5 on 1 and 46 DF, p-value: < 2.2e-16
plot(lc_aqi_data$age_adjusted_incidence_rate, lc_aqi_data$avg_med_aqi)

lm_lc_aqi = lm(age_adjusted_incidence_rate ~ avg_med_aqi, data = lc_aqi_data)
lc_aqi_data %>%
plot_ly(x = ~avg_med_aqi) %>%
add_markers(y = ~age_adjusted_incidence_rate, name = "AQI") %>%
add_lines(x = ~avg_med_aqi, y = fitted(lm_lc_aqi), name = "Regression Fit")
write_csv(lc_fips, "./data/lung/lc_fips.csv")
write_csv(county_lc, "./data/lung/county_lc.csv")
state_lc_data = state_lc %>%
mutate(outcome = "lung cancer")
write_csv(state_lc_data, "./data/lung/state_lc_data.csv")